Model:
ID and
DAY_IDEDA:
TARGET-vars correlationlibrary(readr)
library(dplyr)
##
## Attachement du package : 'dplyr'
## Les objets suivants sont masqués depuis 'package:stats':
##
## filter, lag
## Les objets suivants sont masqués depuis 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(purrr)
library(tibble)
library(Hmisc)
##
## Attachement du package : 'Hmisc'
## Les objets suivants sont masqués depuis 'package:dplyr':
##
## src, summarize
## Les objets suivants sont masqués depuis 'package:base':
##
## format.pval, units
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(PerformanceAnalytics)
## Le chargement a nécessité le package : xts
## Le chargement a nécessité le package : zoo
##
## Attachement du package : 'zoo'
## Les objets suivants sont masqués depuis 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attachement du package : 'xts'
## Les objets suivants sont masqués depuis 'package:dplyr':
##
## first, last
##
## Attachement du package : 'PerformanceAnalytics'
## L'objet suivant est masqué depuis 'package:graphics':
##
## legend
X_train_raw <- read.csv("../data/X_train.csv")
X_test_raw <- read.csv("../data/X_test.csv")
y_train <- read.csv("../data/y_train.csv")
Shape of the data sets:
print("X train:", )
## [1] "X train:"
dim(X_train_raw)
## [1] 1494 35
print("X test:", )
## [1] "X test:"
dim(X_test_raw)
## [1] 654 35
print("y train:", )
## [1] "y train:"
dim(y_train)
## [1] 1494 2
Are samples’ IDs in X and y aligned? Number of misaligned rows:
sum(X_train_raw$ID != y_train$ID)
## [1] 0
Yes, they are.
Missing values by column. Training set:
tibble_ <- X_train_raw
na_cols_train <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
na_cols_train
Test set:
tibble_ <- X_test_raw
na_cols_test <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
na_cols_test
Are cols with NAs the same in training and test sets?
print("In train set only:")
## [1] "In train set only:"
print(setdiff(na_cols_train$column_names, na_cols_test$column_names))
## character(0)
print("In test set only:")
## [1] "In test set only:"
print(setdiff(na_cols_test$column_names, na_cols_train$column_names))
## character(0)
Columns’ dtypes. The dtypes of all columns have been properly detected on the data import.
str(X_train_raw)
## 'data.frame': 1494 obs. of 35 variables:
## $ ID : int 1054 2049 1924 297 1101 1520 1546 1069 1323 1618 ...
## $ DAY_ID : int 206 501 687 720 818 467 144 1136 83 307 ...
## $ COUNTRY : chr "FR" "FR" "FR" "DE" ...
## $ DE_CONSUMPTION : num 0.2101 -0.0224 1.395 -0.9833 0.1438 ...
## $ FR_CONSUMPTION : num -0.427 -1.003 1.979 -0.849 -0.617 ...
## $ DE_FR_EXCHANGE : num -0.6065 -0.0221 1.0213 -0.8396 -0.925 ...
## $ FR_DE_EXCHANGE : num 0.6065 0.0221 -1.0213 0.8396 0.925 ...
## $ DE_NET_EXPORT : num NA -0.574 -0.622 -0.271 NA ...
## $ FR_NET_EXPORT : num 0.693 -1.131 -1.683 0.563 0.99 ...
## $ DE_NET_IMPORT : num NA 0.574 0.622 0.271 NA ...
## $ FR_NET_IMPORT : num -0.693 1.131 1.683 -0.563 -0.99 ...
## $ DE_GAS : num 0.441 0.175 2.352 0.488 0.239 ...
## $ FR_GAS : num -0.214 0.427 2.122 0.195 -0.241 ...
## $ DE_COAL : num 0.741 -0.17 1.572 -1.474 1.004 ...
## $ FR_COAL : num 0.289 -0.762 0.777 -0.786 -0.275 ...
## $ DE_HYDRO : num 2.209 0.188 -0.109 -0.368 -0.23 ...
## $ FR_HYDRO : num 0.208 -0.807 0.779 1.32 -0.796 ...
## $ DE_NUCLEAR : num 0.70961 -1.88274 -1.89711 -0.20555 -0.00558 ...
## $ FR_NUCLEAR : num -0.19 -2.186 0.735 -1.59 0.177 ...
## $ DE_SOLAR : num 0.102 1.987 -1.116 1.752 0.694 ...
## $ FR_SOLAR : num 1.249 3.237 -0.371 0.563 0.724 ...
## $ DE_WINDPOW : num -0.5734 -0.0355 -0.2988 -0.0101 -0.7749 ...
## $ FR_WINDPOW : num -0.269 -0.107 -0.141 0.367 -0.564 ...
## $ DE_LIGNITE : num 0.87 -0.194 0.428 -2.331 0.691 ...
## $ DE_RESIDUAL_LOAD: num 0.627 -0.395 1.337 -1.192 0.572 ...
## $ FR_RESIDUAL_LOAD: num -0.445 -1.183 1.947 -0.977 -0.526 ...
## $ DE_RAIN : num -0.173 -1.24 -0.481 -1.115 -0.541 ...
## $ FR_RAIN : num -0.556 -0.77 -0.313 -0.508 -0.425 ...
## $ DE_WIND : num -0.791 1.522 0.431 -0.499 -1.088 ...
## $ FR_WIND : num -0.283 0.828 0.488 -0.236 -1.012 ...
## $ DE_TEMP : num -1.069 0.437 0.685 0.351 0.614 ...
## $ FR_TEMP : num -0.0634 1.8312 0.1148 -0.4175 0.7295 ...
## $ GAS_RET : num 0.339 -0.659 0.536 0.912 0.245 ...
## $ COAL_RET : num 0.1246 0.0471 0.7433 -0.2962 1.5266 ...
## $ CARBON_RET : num -0.00244 -0.49037 0.20495 1.07395 2.61438 ...
COUNTRY var and country label prefixValue counts in the COUNTRY column:
X_train_raw %>%
count(COUNTRY) %>%
mutate(percentage = round(n / sum(n) * 100, 1))
The data describes two countries, FR and DE, roughly equally presented.
Duplicates among DAY_ID in the training set:
X_train_raw %>%
count(DAY_ID) %>%
mutate(n = n - 1) %>%
rename(num_of_duplicates = n) %>%
count(num_of_duplicates) %>%
rename(count = n) %>%
mutate(percentage = round(count / sum(count) * 100, 1))
Most of the lines (3/4) have duplicates.
643*2 + 208 gives the number of rows in the training set.
Duplicates among DAY_ID in the test set:
X_test_raw %>%
count(DAY_ID) %>%
mutate(n = n - 1) %>%
rename(num_of_duplicates = n) %>%
count(num_of_duplicates) %>%
rename(count = n) %>%
mutate(percentage = round(count / sum(count) * 100, 1))
The number of duplicates is more or less the same as in the training set.
For the rows that have the same DAY_ID, in which columns
are they different? Training set:
tibble_ <- X_train_raw
tibble_$TARGET <- y_train$TARGET
# given a DAY_ID, select rows corresponding to it and get the names of the
# columns where the two rows having this DAY_ID differ
non_duplicated_cols <- function(day_id_) {
col_names <- tibble_ %>%
filter(DAY_ID == day_id_) %>%
select_if(function(column) !identical(column[1], column[2])) %>%
names() %>%
paste(., collapse = ", ")
return(col_names)
}
# get the list of day IDs that are duplicated in the table
ids <- tibble_ %>%
count(DAY_ID) %>%
filter(n > 1) %>%
pull(DAY_ID)
# for each pair of duplicated rows, get a list of column names that have
# different values in the pair of duplicated rows, drop duplicates
results <- lapply(ids, non_duplicated_cols) %>% unique()
results
## [[1]]
## [1] "ID, COUNTRY, TARGET"
Same question for the test set (no TARGET):
tibble_ <- X_test_raw
# given a DAY_ID, select rows corresponding to it and get the names of the
# columns where the two rows having this DAY_ID differ
non_duplicated_cols <- function(day_id_) {
col_names <- tibble_ %>%
filter(DAY_ID == day_id_) %>%
select_if(function(column) !identical(column[1], column[2])) %>%
names() %>%
paste(., collapse = ", ")
return(col_names)
}
# get the list of day IDs that are duplicated in the table
ids <- tibble_ %>%
count(DAY_ID) %>%
filter(n > 1) %>%
pull(DAY_ID)
# for each pair of duplicated rows, get a list of column names that have
# different values in the pair of duplicated rows, drop duplicates
results <- lapply(ids, non_duplicated_cols) %>% unique()
results
## [[1]]
## [1] "ID, COUNTRY"
Thus the rows having the same value of DAY_ID differ
only by ID (obviously), COUNTRY (FR or DE) and
TARGET (prediction for FR or DE). For the rest they are
identical.
DAY_ID
duplicates.First select rows that don’t have duplicates:
# get the list of day IDs that are not duplicated in the training set
ids_train <- X_train_raw %>%
count(DAY_ID) %>%
filter(n == 1) %>%
pull(DAY_ID)
# get the list of day IDs that are not duplicated in the test set
ids_test <- X_test_raw %>%
count(DAY_ID) %>%
filter(n == 1) %>%
pull(DAY_ID)
To which countries do these lines in the training and test sets correspond?
print(X_train_raw %>% filter(DAY_ID %in% ids_train) %>% count(COUNTRY))
## COUNTRY n
## 1 FR 208
print(X_test_raw %>% filter(DAY_ID %in% ids_test) %>% count(COUNTRY))
## COUNTRY n
## 1 FR 76
All rows that don’t have a duplicate in DAY_ID
correspond to country FR.
What about missing vals in these non duplicated lines?
tibble_ <- X_train_raw %>% filter(DAY_ID %in% ids_train)
na_cols_train <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
print(na_cols_train)
## # A tibble: 6 × 2
## column_names missing_percentage
## <chr> <dbl>
## 1 DE_NET_EXPORT 59.6
## 2 DE_NET_IMPORT 59.6
## 3 FR_NET_EXPORT 33.7
## 4 FR_NET_IMPORT 33.7
## 5 DE_FR_EXCHANGE 12
## 6 FR_DE_EXCHANGE 12
tibble_ <- X_test_raw %>% filter(DAY_ID %in% ids_test)
na_cols_test <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
print(na_cols_test)
## # A tibble: 6 × 2
## column_names missing_percentage
## <chr> <dbl>
## 1 DE_NET_EXPORT 61.8
## 2 DE_NET_IMPORT 61.8
## 3 FR_NET_EXPORT 31.6
## 4 FR_NET_IMPORT 31.6
## 5 DE_FR_EXCHANGE 11.8
## 6 FR_DE_EXCHANGE 11.8
To conclude, the lines that don’t have duplicates in
DAY_ID have NAs in the cols describing countries’ net
import and export and DE<->FR exchange
(DE_FR_EXCHANGE, FR_DE_EXCHANGE,
DE_NET_EXPORT, FR_NET_EXPORT,
DE_NET_IMPORT, FR_NET_IMPORT).
And in duplicated lines?
tibble_ <- X_train_raw %>% filter(!(DAY_ID %in% ids_train))
na_cols_train <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
print(na_cols_train)
## # A tibble: 6 × 2
## column_names missing_percentage
## <chr> <dbl>
## 1 DE_RAIN 7.3
## 2 FR_RAIN 7.3
## 3 DE_WIND 7.3
## 4 FR_WIND 7.3
## 5 DE_TEMP 7.3
## 6 FR_TEMP 7.3
tibble_ <- X_test_raw %>% filter(!(DAY_ID %in% ids_test))
na_cols_test <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
print(na_cols_test)
## # A tibble: 6 × 2
## column_names missing_percentage
## <chr> <dbl>
## 1 DE_RAIN 6.9
## 2 FR_RAIN 6.9
## 3 DE_WIND 6.9
## 4 FR_WIND 6.9
## 5 DE_TEMP 6.9
## 6 FR_TEMP 6.9
The lines having duplicates in DAY_ID have NAs only in
the cols describing countries’ weather conditions (DE_RAIN,
FR_RAIN, DE_WIND, FR_WIND,
DE_TEMP, FR_TEMP).
The two lists of columns above span all (12) the columns with missing values.
Find the groups of columns that have simultaneously NAs in one row for the training set:
missing_columns_list <- X_train_raw %>%
pmap(function(...) names(list(...))[sapply(list(...), is.na)]) %>%
Filter(function(x) length(x) > 0, .) %>%
unique(.)
missing_columns_list
## [[1]]
## [1] "DE_NET_EXPORT" "DE_NET_IMPORT"
##
## [[2]]
## [1] "DE_FR_EXCHANGE" "FR_DE_EXCHANGE" "DE_NET_EXPORT" "FR_NET_EXPORT"
## [5] "DE_NET_IMPORT" "FR_NET_IMPORT"
##
## [[3]]
## [1] "DE_NET_EXPORT" "FR_NET_EXPORT" "DE_NET_IMPORT" "FR_NET_IMPORT"
##
## [[4]]
## [1] "DE_RAIN" "FR_RAIN" "DE_WIND" "FR_WIND" "DE_TEMP" "FR_TEMP"
Same for the test set:
X_test_raw %>%
pmap(function(...) names(list(...))[sapply(list(...), is.na)]) %>%
Filter(function(x) length(x) > 0, .) %>%
unique(.)
## [[1]]
## [1] "DE_NET_EXPORT" "DE_NET_IMPORT"
##
## [[2]]
## [1] "DE_FR_EXCHANGE" "FR_DE_EXCHANGE" "DE_NET_EXPORT" "FR_NET_EXPORT"
## [5] "DE_NET_IMPORT" "FR_NET_IMPORT"
##
## [[3]]
## [1] "DE_NET_EXPORT" "FR_NET_EXPORT" "DE_NET_IMPORT" "FR_NET_IMPORT"
##
## [[4]]
## [1] "DE_RAIN" "FR_RAIN" "DE_WIND" "FR_WIND" "DE_TEMP" "FR_TEMP"
Apparently we get the same groups of columns.
As a side note, the analysis above shows that the test set has the same properties as the training set and thus it is a representative sample of the data.
To conclude the analysis above:
DAY_ID col have NAs in all 6 cols describing countries’
weather conditions (DE_RAIN, FR_RAIN,
DE_WIND, FR_WIND, DE_TEMP,
FR_TEMP).DAY_ID col can have NAs simultaneously in the following
groups of cols:
DE_NET_EXPORT, DE_NET_IMPORTDE_NET_EXPORT, FR_NET_EXPORT,
DE_NET_IMPORT, FR_NET_IMPORTDE_FR_EXCHANGE, FR_DE_EXCHANGE,
DE_NET_EXPORT, FR_NET_EXPORT,
DE_NET_IMPORT, FR_NET_IMPORTExcluding vars ID and DAY_ID useless for
the prediction and setting apart COUNTRY, we are left with
numeric vars only.
Pair corr matrix. I use Spearman corr to capture non-linear corrs.
res <- cor(select(X_train_raw, -ID, -DAY_ID, -COUNTRY), method = "spearman", use = "complete.obs")
res2 <- rcorr(as.matrix(select(X_train_raw, -ID, -DAY_ID, -COUNTRY)), type = "spearman")
Plot:
corrplot(
res,
type = "upper",
order = "hclust",
tl.col = "black",
tl.srt = 45,
tl.cex = 0.4)
Let’s explore in more details the correlation matrix. For the convenience we will flatten it together with the matrix of p-values.
# Code source: http://www.sthda.com/english/wiki/correlation-matrix-a-quick-start-guide-to-analyze-format-and-visualize-a-correlation-matrix-using-r-software#a-simple-function-to-format-the-correlation-matri
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
res2_flat <- flattenCorrMatrix(res2$r, res2$P) %>%
mutate(cor = round(cor, 2), p = round(100 * p, 0)) %>%
arrange(desc(abs(cor)))
res2_flat
First we notice that all correlations with an absolute value greater than 0.07 are significant with p-value < 1%.
Other observations:
DE_FR_EXCHANGE vs FR_DE_EXCHANGE,
DE_NET_EXPORT vs DE_NET_IMPORT,
FR_NET_EXPORT vs FR_NET_IMPORT. Actually one
variable in the pair is the inverse of another. The plot below shows all
pair correlations between these variables. From it , and from the
correlation matrix, we also notice that DE<->FR exchange is
strongly correlated with DE/FR net import/export. At the same time
DE_NET_IMPORT and FR_NET_IMPORT are
decorrelated.chart.Correlation(
select(X_train_raw, DE_FR_EXCHANGE, FR_DE_EXCHANGE, DE_NET_EXPORT, DE_NET_IMPORT, FR_NET_EXPORT, FR_NET_IMPORT),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
chart.Correlation(
select(X_train_raw, FR_DE_EXCHANGE, DE_NET_IMPORT, FR_NET_EXPORT),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
Relation between FR/DE export, FR<->DE exchange and FR/DE consumption (total and without renewable).
FR_CONSUMPTION and FR_RESIDUAL_LOAD versus
DE_CONSUMPTION and DE_RESIDUAL_LOAD. Could it
be attributed to the different proportion of the renewable sources in
two countries?FR_DE_EXCHANGE and FR_CONSUMPTION.chart.Correlation(
select(X_train_raw, FR_DE_EXCHANGE, DE_NET_EXPORT, FR_NET_EXPORT, FR_CONSUMPTION, DE_CONSUMPTION, FR_RESIDUAL_LOAD, DE_RESIDUAL_LOAD),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
Relation between weather conditions in two countries. Rather strong correlation between temperature and wind in FR and DE, the amount of rain in two countries is essentially unrelated.
chart.Correlation(
select(X_train_raw, DE_RAIN, FR_RAIN, DE_WIND, FR_WIND, DE_TEMP, FR_TEMP),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
Relation between weather conditions and production of renewable energy in two countries.
x_WIND and
x_WINDPOW for both countries, as it consisted of two approx
linear relations with different slopes.chart.Correlation(
select(X_train_raw, FR_RAIN, FR_WIND, FR_TEMP, FR_SOLAR, FR_WINDPOW, FR_HYDRO),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
chart.Correlation(
select(X_train_raw, DE_RAIN, DE_WIND, DE_TEMP, DE_SOLAR, DE_WINDPOW, DE_HYDRO),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
Relations between countries’ consumption rates, exchange between them and whether conditions.
No clear relationship between any of the pair of variables.
chart.Correlation(
select(X_train_raw, FR_CONSUMPTION, FR_RESIDUAL_LOAD, FR_DE_EXCHANGE, FR_NET_EXPORT, FR_RAIN, FR_WIND, FR_TEMP),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
Correlation between consumption and production.
chart.Correlation(
select(X_train_raw, FR_CONSUMPTION, FR_RESIDUAL_LOAD, FR_GAS, FR_COAL, FR_NUCLEAR),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
We exclude from the analysis ID, DAY_ID and
perfectly correlated vars DE_FR_EXCHANGE,
FR_NET_IMPORT, DE_NET_IMPORT.
For both countries there is no clear relationship between any of the explanatory variable and target.
For France:
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == "FR") %>%
select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
# Create a vector of explanatory variable names
explanatory_vars <- colnames(select(merged_tibble, -TARGET))
# Generate scatter plots for each explanatory variable
for (var in colnames(select(merged_tibble, -TARGET))) {
res <- ggplot(merged_tibble, aes_string(x = var, y = "TARGET")) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "red", span = 0.9) +
labs(x = var, y = "Target Variable") +
ggtitle(paste("Scatter Plot of", var, "vs. Target Variable for FR"))
print(res)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 25 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 124 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 124 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 70 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 70 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
For Germany:
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == "DE") %>%
select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
# Create a vector of explanatory variable names
explanatory_vars <- colnames(select(merged_tibble, -TARGET))
# Generate scatter plots for each explanatory variable
for (var in colnames(select(merged_tibble, -TARGET))) {
res <- ggplot(merged_tibble, aes_string(x = var, y = "TARGET")) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "red", span = 0.9) +
labs(x = var, y = "Target Variable") +
ggtitle(paste("Scatter Plot of", var, "vs. Target Variable for DE"))
print(res)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Both countries together:
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
# Create a vector of explanatory variable names
explanatory_vars <- colnames(select(merged_tibble, -TARGET))
# Generate scatter plots for each explanatory variable
for (var in colnames(select(merged_tibble, -TARGET))) {
res <- ggplot(merged_tibble, aes_string(x = var, y = "TARGET")) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "red", span = 0.9) +
labs(x = var, y = "Target Variable") +
ggtitle(paste("Scatter Plot of", var, "vs. Target Variable for DE"))
print(res)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 25 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 124 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 124 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 70 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 70 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
For France.
Notice the distribution of x_CONSUMPTION,
x_COAL, x_GAZ and x_NUCLEAR and
the difference in these distribs for the two countries.
# Merge the two tibbles based on a common identifier
merged_tibble <- X_train_raw %>%
filter(COUNTRY == "FR") %>%
select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
# Generate hist for each explanatory variable
for (var in colnames(merged_tibble)) {
res <- ggplot(merged_tibble, aes_string(x = var)) +
geom_histogram() +
ggtitle(paste(var, " for FR"))
print(res)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 124 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 70 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For Germany only the TARGET distribution is different, all other cols are identical to France:
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == "DE") %>%
select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
res <- ggplot() +
geom_histogram(aes(x = y_train[X_train_raw$COUNTRY == "FR", "TARGET"], y = stat(count / sum(count))), fill = "blue", alpha = 0.5) +
geom_histogram(aes(x = y_train[X_train_raw$COUNTRY == "DE", "TARGET"], y = stat(count / sum(count))), fill = "red", alpha = 0.5) +
labs(title = "TARGET for FR and DE", x = "TARGET") +
scale_fill_manual(values = c("blue", "red"), labels = c("TARGET FR", "TARGET DE"))
print(res)
## Warning: `stat(count / sum(count))` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count / sum(count))` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
No clear relation between target values for the same day for the two countries.
rearranged_data <- left_join(X_train_raw, y_train, by = "ID") %>%
select(DAY_ID, COUNTRY, TARGET) %>%
pivot_wider(names_from = COUNTRY, values_from = TARGET, values_fill = NA) %>%
drop_na()
# Display the rearranged table
rearranged_data
res <- ggplot(rearranged_data) +
geom_point(aes(x = FR, y = DE)) +
labs(x = "TARGET FR", y = "TARGET DE")
print(res)
We consider all 12 variables that have missing values and plot a histogram of TARGET values for each of them for the rows with and without missing values. We plot these histograms separately for FR and DE.
First, for FR. Apparently the presence of missing values is not related to the TARGET value.
df <- X_train_raw %>%
filter(COUNTRY == "FR") %>%
select(
FR_DE_EXCHANGE,
FR_NET_EXPORT,
DE_NET_IMPORT,
DE_RAIN,
FR_RAIN,
DE_WIND,
FR_WIND,
DE_TEMP,
FR_TEMP)
# Generate hist for each explanatory variable
for (var in colnames(df)) {
mask <- df %>% select(., all_of(var)) %>% is.na()
res <- ggplot() +
geom_histogram(aes(x = y_train[!mask, "TARGET"], y = stat(count / sum(count))), fill="blue", alpha = 0.5) +
geom_histogram(aes(x = y_train[mask, "TARGET"], y = stat(count / sum(count))), fill="red", alpha = 0.5) +
ggtitle(paste(var, " for FR"))
print(res)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For DE. SAme conclusion: no clear relation between the presence of
missing values and TARGET. Note that the absence of missing values for
FR_DE_EXCHANGE, FR_NET_EXPORT,
DE_NET_IMPORT is in line with the previous analysis (DE
lines always have a duplicate DAY_ID for FR).
df <- X_train_raw %>%
filter(COUNTRY == "DE") %>%
select(
FR_DE_EXCHANGE,
FR_NET_EXPORT,
DE_NET_IMPORT,
DE_RAIN,
FR_RAIN,
DE_WIND,
FR_WIND,
DE_TEMP,
FR_TEMP)
# Generate hist for each explanatory variable
for (var in colnames(df)) {
mask <- df %>% select(., all_of(var)) %>% is.na()
res <- ggplot() +
geom_histogram(aes(x = y_train[!mask, "TARGET"], y = stat(count / sum(count))), fill="blue", alpha = 0.5) +
geom_histogram(aes(x = y_train[mask, "TARGET"], y = stat(count / sum(count))), fill="red", alpha = 0.5) +
ggtitle(paste(var, " for DE"))
print(res)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Heat maps with color code corresponding to the mean TARGET per bin and text - number of examples per bin.
plot_var_interaction <- function(df, col1, col2, country, target = "TARGET", nbins = 5) {
# Create bins and calculate mean TARGET and count for each bin
summary_data <- df %>%
filter(COUNTRY == country) %>%
select(col1, col2, target) %>%
mutate(
col1_bin = cut(!!sym(col1), breaks = nbins),
col2_bin = cut(!!sym(col2), breaks = nbins)
) %>%
group_by(col1_bin, col2_bin) %>%
summarise(mean_target = mean(!!sym(target), na.rm = TRUE), count_target = n())
# Draw the two-dimensional histogram with colored bins
result <- ggplot(summary_data, aes(x = col1_bin, y = col2_bin)) +
geom_tile(aes(fill = mean_target)) +
geom_text(aes(label = count_target), color = "white", size = 4) +
scale_fill_gradient(low = "blue", high = "red") +
labs(x = col1, y = col2)
print(result)
}
There is no visible trend in the interaction between
FR_NUCLEAR and FR_CONSUMPTION:
col1 <- "FR_NUCLEAR"
col2 <- "FR_CONSUMPTION"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(col1)
##
## # Now:
## data %>% select(all_of(col1))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(col2)
##
## # Now:
## data %>% select(all_of(col2))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(target)
##
## # Now:
## data %>% select(all_of(target))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "DE_NUCLEAR"
col2 <- "DE_CONSUMPTION"
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
Nor there is any trend in the interaction between
FR_NUCLEAR and FR_RESIDUAL_LOAD:
col1 <- "FR_NUCLEAR"
col2 <- "FR_RESIDUAL_LOAD"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "DE_NUCLEAR"
col2 <- "DE_RESIDUAL_LOAD"
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "FR_GAS"
col2 <- "FR_RESIDUAL_LOAD"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "DE_GAS"
col2 <- "DE_RESIDUAL_LOAD"
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "FR_COAL"
col2 <- "FR_RESIDUAL_LOAD"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "DE_COAL"
col2 <- "DE_RESIDUAL_LOAD"
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "FR_DE_EXCHANGE"
col2 <- "FR_RESIDUAL_LOAD"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "FR_DE_EXCHANGE"
col2 <- "DE_RESIDUAL_LOAD"
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "FR_NUCLEAR"
col2 <- "FR_GAS"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "DE_NUCLEAR"
col2 <- "DE_GAS"
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "FR_NUCLEAR"
col2 <- "FR_COAL"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "DE_NUCLEAR"
col2 <- "DE_COAL"
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
There is not enough stats to evidence the pattern visible in the
interaction between GAS_RET and
CARBON_RET:
col1 <- "GAS_RET"
col2 <- "CARBON_RET"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
Same for the interaction between COAL_RET and
CARBON_RET:
col1 <- "COAL_RET"
col2 <- "CARBON_RET"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
Idem for the interaction between COAL_RET and
GAS_RET:
col1 <- "COAL_RET"
col2 <- "GAS_RET"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
Idem:
col1 <- "FR_GAS"
col2 <- "GAS_RET"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "DE_GAS"
col2 <- "GAS_RET"
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
Idem:
col1 <- "FR_COAL"
col2 <- "COAL_RET"
country = "FR"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "DE_COAL"
col2 <- "COAL_RET"
country = "DE"
plot_var_interaction(left_join(X_train_raw, y_train, by = "ID"), col1, col2, country)
## `summarise()` has grouped output by 'col1_bin'. You can override using the
## `.groups` argument.
col1 <- "FR_GAS"
col2 <- "FR_NUCLEAR"
country = "FR"
target <- "TARGET"
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == country) %>%
select(!!sym(col1), !!sym(col2), !!sym(target))
res <- ggplot(merged_tibble) +
geom_point(aes(x = !!sym(col1), y = !!sym(col2), color = !!sym(target)), size = 1, alpha = 0.9) +
scale_color_viridis_c("Taux de chômage", option = "plasma")
print(res)
col1 <- "FR_COAL"
col2 <- "FR_NUCLEAR"
country = "FR"
target <- "TARGET"
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == country) %>%
select(!!sym(col1), !!sym(col2), !!sym(target))
res <- ggplot(merged_tibble) +
geom_point(aes(x = !!sym(col1), y = !!sym(col2), color = !!sym(target)), size = 1, alpha = 0.9) +
scale_color_viridis_c("Taux de chômage", option = "plasma")
print(res)
col1 <- "DE_GAS"
col2 <- "DE_NUCLEAR"
country = "DE"
target <- "TARGET"
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == country) %>%
select(!!sym(col1), !!sym(col2), !!sym(target))
res <- ggplot(merged_tibble) +
geom_point(aes(x = !!sym(col1), y = !!sym(col2), color = !!sym(target)), size = 1, alpha = 0.9) +
scale_color_viridis_c("Taux de chômage", option = "plasma")
print(res)
col1 <- "DE_COAL"
col2 <- "DE_NUCLEAR"
country = "DE"
target <- "TARGET"
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == country) %>%
select(!!sym(col1), !!sym(col2), !!sym(target))
res <- ggplot(merged_tibble) +
geom_point(aes(x = !!sym(col1), y = !!sym(col2), color = !!sym(target)), size = 1, alpha = 0.9) +
scale_color_viridis_c("Taux de chômage", option = "plasma")
print(res)
col1 <- "FR_COAL"
col2 <- "COAL_RET"
country = "FR"
target <- "TARGET"
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == country) %>%
select(!!sym(col1), !!sym(col2), !!sym(target))
res <- ggplot(merged_tibble) +
geom_point(aes(x = !!sym(col1), y = !!sym(col2), color = !!sym(target)), size = 2, alpha = 0.9) +
scale_color_viridis_c("Taux de chômage", option = "plasma")
print(res)
col1 <- "FR_COAL"
col2 <- "CARBON_RET"
country = "FR"
target <- "TARGET"
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == country) %>%
select(!!sym(col1), !!sym(col2), !!sym(target))
res <- ggplot(merged_tibble) +
geom_point(aes(x = !!sym(col1), y = !!sym(col2), color = !!sym(target)), size = 2, alpha = 0.9) +
scale_color_viridis_c("Taux de chômage", option = "plasma")
print(res)
day_ids <- X_train_raw%>%
filter(COUNTRY == "DE") %>%
pull(DAY_ID)
mask_fr_ids <- (X_train_raw$DAY_ID %in% day_ids) & (X_train_raw$COUNTRY == "FR")
mask_de_ids <- (X_train_raw$DAY_ID %in% day_ids) & (X_train_raw$COUNTRY == "DE")
merged_tibble <- X_train_raw[mask_fr_ids,] %>%
select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
merged_tibble$delta_TARGET <- y_train[mask_fr_ids, "TARGET"] - y_train[mask_de_ids, "TARGET"]
# Generate scatter plots for each explanatory variable
for (var in colnames(select(merged_tibble, -delta_TARGET))) {
res <- ggplot(merged_tibble, aes_string(x = var, y = "delta_TARGET")) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "red", span = 0.9) +
labs(x = var, y = "delta Target Variable") +
ggtitle(paste("Scatter Plot of", var, "vs. delta Target Variable"))
print(res)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Fill-in NAs:
X_train <- X_train_raw %>% replace(is.na(.), 0)
X_test <- X_test_raw %>% replace(is.na(.), 0)
Drop unused cols. Mind that we use ID and
DAY_ID as explanatory vars.
X_train <- X_train %>% select(-COUNTRY)
X_test <- X_test %>% select(-COUNTRY)
Fit OLS:
lm_ <- lm(y_train$TARGET ~ ., data = X_train)
coef(lm_) %>%
enframe(., name = "Names", value = "Values") %>%
arrange(desc(abs(Values)))
Compare with the OLS model in Py benchmlark notebook:
py_lm_ <- read.csv("../tmp/py_benchmark_model_coeffs.csv")
r_lm_ <- tibble(param = names(coef(lm_)), val = coef(lm_))
# rename Intercept
r_lm_$param[1] = "intercept"
# outer join between two tibbles
# ...
lm_join_ <- full_join(py_lm_, r_lm_, by = "param", suffix = c(".py", ".r"))
# element wise comparison of the models' coefficients
lm_join_$equal <- near(py_lm_$val, r_lm_$val)
lm_join_
Except few variables (DE_FR_EXCHANGE (different),
FR_DE_EXCHANGE (NA), DE_NET_EXPORT
(different),FR_NET_EXPORT (different),
DE_NET_IMPORT(NA), FR_NET_IMPORT (NA)), the
coefficients of the models are the same. For the coefficients that
differ, the reason is that OLS in R has automatically detected highly
correlated variables and excluded them from the model, thus some of the
coefficients are NAs and others are different.
Predict:
y_hat_train <- predict(lm_, X_train)
## Warning in predict.lm(lm_, X_train): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
y_hat_test <- predict(lm_, X_test)
## Warning in predict.lm(lm_, X_test): prediction from rank-deficient fit; attr(*,
## "non-estim") has doubtful cases
Spearman correlation for the predictions on the training set:
cor(y_train$TARGET, y_hat_train, method = "spearman")
## [1] 0.2786787
This correlation value is the same as the one of the Py benchmark model.
Export predictions on the test set:
to_export <- tibble(ID = X_test_raw$ID, TARGET = y_hat_test)
write_csv(to_export, "../data/y_hat_test.csv")